#Goal Determine how consistent enrichment is between biological and technical replicates of mouse OBs. Determine the range of enrichment fold for all ORs.

setup and load data

library(tidyverse)
library(heatmaply)

rawdat <- read_csv("~/Desktop/obmap/r_analysis/capture_enrichment/obmap_enrichmenteffect_gen25.csv") %>%
  filter(!is.na(gene_name)) %>%
  mutate(isOR = str_detect(gene_name, "Olfr"),
         isPS = str_detect(gene_name, "-ps"),
         isTaar = str_detect(gene_name, "Taar"),
         isORTaar = isOR + isTaar)

ectopic <- c("Olfr287","Olfr32")
olfrs_nops <- rawdat %>% filter(isOR == T) %>% filter(isPS == F)
olfrs <- olfrs_nops %>% filter(!(gene_name %in% ectopic)) %>% select(-ens_raw, -ens_id)
olfrstaars <- rawdat %>% 
  filter(isORTaar == T) %>% 
  filter(isPS == F) %>% 
  filter(!(gene_name %in% ectopic)) %>%
  select(-ens_raw, ens_id)

check fold enrichment

olfrs %>% ggplot() +
  geom_point(aes(M6nocapOB, M6OB))

print("M6 stats")
## [1] "M6 stats"
#number of ORs with reads
sum(length(which(olfrs$M6nocapOB > 0)))
## [1] 412
sum(length(which(olfrs$M6OB > 0)))
## [1] 844
mean(olfrs$M6nocapOB)
## [1] 0.0637276
mean(olfrs$M6OB)
## [1] 360.2719
median(olfrs$M6nocapOB)
## [1] 0
median(olfrs$M6OB)
## [1] 106.735
olfrs %>% ggplot() +
  geom_point(aes(M4OBnocap, M4OBcap))

#number of ORs with reads
print("M4 stats")
## [1] "M4 stats"
sum(length(which(olfrs$M4OBnocap > 0)))
## [1] 496
sum(length(which(olfrs$M4OBcap > 0)))
## [1] 992
mean(olfrs$M4OBnocap)
## [1] 0.1499821
mean(olfrs$M4OBcap)
## [1] 61.52858
median(olfrs$M4OBnocap)
## [1] 0
median(olfrs$M4OBcap)
## [1] 19.665
m6compare <- tibble(Olfrname = olfrs$gene_name,
                    m6nocap = olfrs$M6nocapOB,
                    m6cap = olfrs$M6OB) %>%
  mutate(enr = (m6cap + 1)/(m6nocap + 1))

# more abundant ORs were enriched more
ggplot(m6compare, aes(m6cap, enr)) +
  geom_point()

biological replicates

bioreps <- olfrs %>% select(contains("OB")) %>% select(-M6nocapOB, -M4OBnocap)
bioreps <- olfrs %>% 
  select(M4OB, M14XOB, M13XOB, M12OB, 
         M16XOB, M9OBEC, M8OB, M10ECOB) 
# %>%
#   rowwise() %>% 
#   mutate(tots = sum(M4OB, M14XOB, M13XOB, M12OB, 
#                     M16XOB, M9OBEC, M8OB, M10ECOB)) %>%
#   filter(tots != 0) %>%
#   select(-tots)

#pairwise spearman correlation matrix
bio_pmtx <- matrix(data=0, nrow=ncol(bioreps), ncol=ncol(bioreps))
colnames(bio_pmtx) <- colnames(bioreps)
rownames(bio_pmtx) <- colnames(bioreps)
for (i in 1:ncol(bioreps)) {
  correlation <- vector("numeric", length = ncol(bioreps))
  for (j in 1:ncol(bioreps)) {
    if (i == j) {
      bio_pmtx[i,j] <- 1
    } else {
    bio_pmtx[i,j] <- cor(bioreps[i], bioreps[j], method = "s")
    }
  }
}

#calc mean cor for pairwise 
r1 <- sum(bio_pmtx[1,2:8])
r2 <- sum(bio_pmtx[1,3:8])
r3 <- sum(bio_pmtx[1,4:8])
r4 <- sum(bio_pmtx[1,5:8])
r5 <- sum(bio_pmtx[1,6:8])
r6 <- sum(bio_pmtx[1,7:8])
r7 <- sum(bio_pmtx[1,8])
sum(r1,r2,r3,r4,r5,r6,r7)/sum(7+6+5+4+3+2+1)
## [1] 0.7581894
heatmaply_cor(bio_pmtx)
ggplot(bioreps, aes(M4OB, M14XOB)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = "loess")
## `geom_smooth()` using formula 'y ~ x'

technical replicates of capture enrichment (2 different arrays on same cDNA, same library)

techreps <- olfrs %>% select(NC_10Ar2, NC_10Br2, NC_4Ar2, NC_4Br2, OC_10Ar2, OC_10Br2, OC_4Ar2, OC_4Br2)

one <- cor(techreps$NC_10Ar2, techreps$OC_10Ar2, method = "spearman")
two <- cor(techreps$NC_10Br2, techreps$OC_10Br2, method = "spearman")
three <- cor(techreps$NC_4Ar2, techreps$OC_4Ar2, method = "spearman")
four <- cor(techreps$NC_4Br2, techreps$OC_4Br2, method = "spearman")
mean(c(one, two, three, four))
## [1] 0.9508835
#pairwise spearman correlation matrix
tech_pmtx <- matrix(data=0, nrow=ncol(techreps), ncol=ncol(techreps))
colnames(tech_pmtx) <- colnames(techreps)
rownames(tech_pmtx) <- colnames(techreps)
for (i in 1:ncol(techreps)) {
  correlation <- vector("numeric", length = ncol(techreps))
  for (j in 1:ncol(techreps)) {
    if (i == j) {
      tech_pmtx[i,j] <- 1
    } else {
    tech_pmtx[i,j] <- cor(techreps[i], techreps[j], method = "spearman")
    }
  }
}

heatmaply_cor(tech_pmtx)
ggplot(olfrs, aes(log2(M4OB + 1), log2(M4OBcap + 1))) + 
  geom_point(alpha = 0.25) + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(olfrs, aes(log2(NC_10Ar2 + 1), log2(OC_10Ar2 + 1))) + 
  geom_point(alpha = 0.25) + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(olfrs, aes(NC_10Ar2, OC_10Ar2)) + 
  geom_point(alpha = 0.25) + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

#library(edgeR)
#maPlot(olfrs$NC_10Ar2, olfrs$OC_10Ar2)
#maPlot(rawdat$NC_10Ar2, rawdat$OC_10Ar2)

ma plots

library(edgeR)

#best technical replicate - all genes
maPlot(rawdat$NC_10Ar2, rawdat$OC_10Ar2)

#best technical replicate - just ORs
maPlot(olfrs$NC_10Ar2, olfrs$OC_10Ar2)

#best biological replicate - just ORs
maPlot(olfrs$M4OB, olfrs$M14XOB)

#use m6nocap and m6cap samples (1-24, OB) to determine which ORs dropout

m6samples <- olfrstaars %>% select(gene_name, contains("M6"))

m6nocap <- m6samples %>% select(gene_name, contains("nocap"))
m6nocapSum <- m6nocap %>% 
  select(-gene_name, -M6nocapOB) %>% 
  mutate(noCapSum = rowSums(.)) %>% 
  pull(noCapSum) 

m6cap <- m6samples %>% 
  select(gene_name, !contains("nocap")) %>% 
  select(gene_name, contains("S"), M6OB)
m6capSum <- m6cap %>% 
  select(-gene_name, -M6OB) %>% 
  mutate(capSum = rowSums(.)) %>% 
  pull(capSum)

m6compare <- tibble(gene = m6samples$gene_name,
                    noCap = m6nocapSum,
                    Cap = m6capSum,
                    noCapOB = m6samples$M6nocapOB,
                    CapOB = m6samples$M6OB) %>%
  mutate(secComp = ifelse(Cap > noCap, 
                          100, ifelse(Cap == noCap,
                                    10, 0)),
         obComp = ifelse(CapOB > noCapOB, 
                         100, ifelse(CapOB == noCapOB,
                                   10, 0)),
         accord = secComp + obComp,
         status = ifelse(accord == 200, "Enriched", 
                    ifelse(accord == 110, "Likely_enriched", 
                      ifelse(accord == 100, "Variable", 
                        ifelse(accord == 20, "Even", 
                          ifelse(accord < 20, "Dropout", NA))))),
         enriched = str_detect(tolower(status), "enriched"))

notEnriched <- m6compare %>% filter(enriched == F)
notEnrORs <- notEnriched %>% pull(gene)
paste0("There are ", length(notEnrORs), " non-enriched ORs")
## [1] "There are 17 non-enriched ORs"
ggplot(notEnriched) +
  geom_point(aes(noCap, Cap, color = enriched))

ggplot(notEnriched) +
  geom_point(aes(noCapOB, CapOB, color = enriched))

fold enrichment using genomic DNA controls

b6gen samples not included in the updated set of gencode v25 samples due to being 25bp PE

b6dna <- olfrs %>% select(gene_id, b6cap100, b6nocap7, b6nocap9, m4cap, m4uncap, M6OB, M6nocapOB) %>% mutate(foldenr7 = b6cap100/(b6nocap7 + 0.1), foldenrm4 = m4cap/(m4uncap + 0.1), foldenrm6 = M6OB/(M6nocapOB + 0.1)) %>% arrange(desc(foldenrm6)) %>% mutate(order = factor(gene_id, gene_id))

ggplot(b6dna, aes(b6nocap7, b6cap100)) +
  geom_point()

ggplot(b6dna, aes(m4uncap, m4cap)) +
  geom_point()

ggplot(b6dna, aes(M6nocapOB, M6OB)) +
  geom_point()

ggplot(b6dna, aes(order, foldenrm6)) +
  geom_bar(stat="identity") +
  ggtitle("M6OB RNA fold enrichment")

ggplot(b6dna, aes(foldenrm4, foldenrm6)) + 
  geom_point() +
  geom_smooth(method = "lm")

ggplot(b6dna, aes(m4cap, M6OB)) + 
  geom_point() +
  geom_smooth(method = "lm")

ggplot(b6dna, aes(m4uncap, M6nocapOB)) + 
  geom_point() +
  geom_smooth(method = "lm")